library(polars)
## Warning: package 'polars' was built under R version 4.3.2
library(tidyverse)
## Warning: package 'tidyr' was built under R version 4.3.2
## Warning: package 'readr' was built under R version 4.3.2
## Warning: package 'purrr' was built under R version 4.3.2
## Warning: package 'dplyr' was built under R version 4.3.2
library(microbenchmark)
## Warning: package 'microbenchmark' was built under R version 4.3.2

microbenchmark

7mb file benchmark

# Benchmark: read the 7 MB CSV with readr versus polars (eager read).
# Both expressions read the same file; only the reader differs.
perf_7mb_file <- microbenchmark(
  tidy = readr::read_csv("bse_compiled_2023_04_13.csv"),
  polars = pl$read_csv("bse_compiled_2023_04_13.csv")
)

# Print the timing summary, then plot the timings.
perf_7mb_file
autoplot(perf_7mb_file)

7mb df benchmark

Reading with Polars and then converting the result to an R data frame

# Benchmark: readr read versus polars read followed by as.data.frame(),
# so the polars timing includes the conversion to an R data frame.
perf_7mb_df <- microbenchmark(
  tidy = readr::read_csv("bse_compiled_2023_04_13.csv"),
  polars = pl$read_csv("bse_compiled_2023_04_13.csv") |>
    as.data.frame()
)

# Print the timing summary, then plot the timings.
perf_7mb_df
autoplot(perf_7mb_df)

7mb head benchmark

Reading with Polars, converting the result to an R data frame, and taking the head (first 10 rows)

#' Benchmark five ways of reading a CSV and keeping its first 10 rows.
#'
#' Timed expressions:
#'   - tidy:                readr::read_csv(), then head()
#'   - polars:              pl$read_csv(), as.data.frame(), then head()
#'   - polars_head:         pl$read_csv() with polars $head(), then
#'                          as.data.frame()
#'   - polars_lazy_head:    pl$scan_csv() with $head() and $collect()
#'   - polars_lazy_head_df: pl$scan_csv() with $head(), then as.data.frame()
#'
#' @param data_file Path of the CSV file to read.
#' @return The microbenchmark result. Side effect: the same object is also
#'   assigned to `perf_data` with `<<-`, i.e. outside this function's own
#'   environment; the `autoplot(perf_data)` calls that follow each
#'   invocation read that variable.
perf_testing_fn <- function(data_file) {
  timings <- microbenchmark(
    tidy = readr::read_csv(data_file) %>%
      head(n = 10),
    polars = pl$read_csv(data_file) |>
      as.data.frame() %>%
      head(n = 10),
    polars_head = pl$read_csv(data_file)$head(n = 10) |>
      as.data.frame(),
    polars_lazy_head = pl$scan_csv(data_file)$head(n = 10)$collect(),
    polars_lazy_head_df = pl$scan_csv(data_file)$head(n = 10) |>
      as.data.frame()
  )

  # Kept deliberately: later top-level code plots `perf_data`.
  perf_data <<- timings
  timings
}
# Run the head benchmark on the 7 MB file. The call prints its result and
# also sets `perf_data`, which the plot below reads.
perf_testing_fn(data_file = "bse_compiled_2023_04_13.csv")
autoplot(perf_data)

87mb head benchmark

Reading with Polars, converting the result to an R data frame, and taking the head (first 10 rows)

# Run the same head benchmark on the 87 MB file. The call prints its result
# and overwrites `perf_data`, which the plot below reads.
perf_testing_fn(data_file = "T_ONTIME_REPORTING.csv")
autoplot(perf_data)

FilterGroupAgg

Data

# Show the 87 MB file as an R data frame (read with polars, then converted).
pl$read_csv("T_ONTIME_REPORTING.csv") |>
  as.data.frame()

# Keep rows whose ORIGIN is "JFK", group by origin city, and count the
# distinct destination city names per group.
pl$read_csv("T_ONTIME_REPORTING.csv")$
  filter(pl$col("ORIGIN") == "JFK")$
  group_by("ORIGIN_CITY_NAME")$
  agg(
    pl$col("DEST_CITY_NAME")$n_unique()
  )
shape: (1, 2)
| ORIGIN_CITY_NAME | DEST_CITY_NAME |
| str | u32 |
| "New York, NY" | 64 |
# Keep rows whose ORIGIN is "JFK" and count the rows for every
# origin-city / destination-city pair; the count column is named "counts".
pl$read_csv("T_ONTIME_REPORTING.csv")$
  filter(pl$col("ORIGIN") == "JFK")$
  group_by(c("ORIGIN_CITY_NAME", "DEST_CITY_NAME"))$
  agg(
    pl$len()$alias("counts")
  )
shape: (64, 3)
| ORIGIN_CITY_NAME | DEST_CITY_NAME | counts |
| str | str | u32 |
| "New York, NY" | "Indianapolis, … | 163 |
| "New York, NY" | "Orlando, FL" | 404 |
| "New York, NY" | "Denver, CO" | 131 |
| "New York, NY" | "Minneapolis, M… | 124 |
| "New York, NY" | "San Antonio, T… | 31 |
| "New York, NY" | "Los Angeles, C… | 814 |
| "New York, NY" | "Dallas/Fort Wo… | 174 |
| "New York, NY" | "Washington, DC… | 330 |
| "New York, NY" | "Milwaukee, WI" | 62 |
| "New York, NY" | "Eagle, CO" | 31 |
| "New York, NY" | "Aguadilla, PR" | 62 |
| "New York, NY" | "Burbank, CA" | 31 |
| "New York, NY" | "Norfolk, VA" | 151 |
| "New York, NY" | "Chicago, IL" | 261 |
| "New York, NY" | "Cincinnati, OH… | 110 |
| "New York, NY" | "Seattle, WA" | 210 |
| "New York, NY" | "Jacksonville, … | 157 |
| "New York, NY" | "Palm Springs, … | 18 |
| "New York, NY" | "Baltimore, MD" | 79 |
| "New York, NY" | "San Juan, PR" | 283 |
| … | … | … |
| "New York, NY" | "Tampa, FL" | 222 |
| "New York, NY" | "Atlanta, GA" | 301 |
| "New York, NY" | "Salt Lake City… | 187 |
| "New York, NY" | "Portland, ME" | 103 |
| "New York, NY" | "Boston, MA" | 532 |
| "New York, NY" | "Columbus, OH" | 182 |
| "New York, NY" | "Nashville, TN" | 171 |
| "New York, NY" | "Burlington, VT… | 171 |
| "New York, NY" | "San Francisco,… | 556 |
| "New York, NY" | "Austin, TX" | 190 |
| "New York, NY" | "San Diego, CA" | 173 |
| "New York, NY" | "Worcester, MA" | 92 |
| "New York, NY" | "Las Vegas, NV" | 246 |
| "New York, NY" | "Richmond, VA" | 79 |
| "New York, NY" | "Detroit, MI" | 206 |
| "New York, NY" | "Syracuse, NY" | 141 |
| "New York, NY" | "Charlotte, NC" | 276 |
| "New York, NY" | "Buffalo, NY" | 282 |
| "New York, NY" | "Santa Ana, CA" | 31 |
| "New York, NY" | "West Palm Beac… | 222 |
# `pl$count()` is deprecated and will be removed in 0.15.0. Use `pl$len()`
# Tidyverse version of the same query: keep rows whose ORIGIN is "JFK" and
# count the rows for every origin-city / destination-city pair.
# `.groups = "drop"` returns an ungrouped tibble (like the polars result)
# and avoids the "summarise() has grouped output by ..." message.
read_csv("T_ONTIME_REPORTING.csv") %>%
  filter(ORIGIN == "JFK") %>%
  group_by(ORIGIN_CITY_NAME, DEST_CITY_NAME) %>%
  summarise(counts = n(), .groups = "drop")

FilterGroup Benchmark

# Benchmark the filter -> group -> count query on the 87 MB file.
#   polars         : eager read, result left as a polars DataFrame
#   polars_df      : eager read, result converted to an R data frame
#   polars_lazy    : lazy scan, query collected into a polars DataFrame
#   polars_lazy_df : lazy scan, collected, then converted to an R data frame
#   tidy           : readr + dplyr
perf_mainpulation <- microbenchmark(
  polars = pl$read_csv("T_ONTIME_REPORTING.csv")$
    filter(pl$col("ORIGIN") == "JFK")$
    group_by(c("ORIGIN_CITY_NAME", "DEST_CITY_NAME"))$
    agg(pl$len()$alias("counts")),

  polars_df = pl$read_csv("T_ONTIME_REPORTING.csv")$
    filter(pl$col("ORIGIN") == "JFK")$
    group_by(c("ORIGIN_CITY_NAME", "DEST_CITY_NAME"))$
    agg(pl$len()$alias("counts")) |>
    as.data.frame(),

  polars_lazy = pl$scan_csv("T_ONTIME_REPORTING.csv")$
    filter(pl$col("ORIGIN") == "JFK")$
    group_by(c("ORIGIN_CITY_NAME", "DEST_CITY_NAME"))$
    agg(pl$len()$alias("counts"))$
    collect(),

  # Fixed: this entry previously used pl$read_csv(), which made it an exact
  # duplicate of `polars_df` and measured nothing lazy. It now scans lazily
  # and collects before converting.
  polars_lazy_df = pl$scan_csv("T_ONTIME_REPORTING.csv")$
    filter(pl$col("ORIGIN") == "JFK")$
    group_by(c("ORIGIN_CITY_NAME", "DEST_CITY_NAME"))$
    agg(pl$len()$alias("counts"))$
    collect() |>
    as.data.frame(),

  # `.groups = "drop"` returns an ungrouped tibble and avoids the
  # per-call grouping message from summarise().
  tidy = read_csv("T_ONTIME_REPORTING.csv") %>%
    filter(ORIGIN == "JFK") %>%
    group_by(ORIGIN_CITY_NAME, DEST_CITY_NAME) %>%
    summarise(counts = n(), .groups = "drop")
)

# Print the timing summary, then plot the timings.
perf_mainpulation
autoplot(perf_mainpulation)

profvis

library(profvis)

7mb file

# Profile: polars eager read of the 7 MB file plus conversion to an
# R data frame.
profvis({
  pl$read_csv("bse_compiled_2023_04_13.csv") |> as.data.frame()
})
an image caption Source: profvis data
an image caption Source: profvis data
# Profile: readr read of the 7 MB file.
profvis(
  {
    read_csv("bse_compiled_2023_04_13.csv")
  }
)
an image caption Source: profvis data
an image caption Source: profvis data

87mb file

# Profile: polars eager read of the 87 MB file plus conversion to an
# R data frame.
profvis({
  pl$read_csv("T_ONTIME_REPORTING.csv") |> as.data.frame()
})
profvis data
profvis data
# Profile: readr read of the 87 MB file.
profvis(
  {
    read_csv("T_ONTIME_REPORTING.csv")
  }
)
profvis data
profvis data
# Profile: polars eager read of the 87 MB file followed by the
# filter / group / count query; the result stays a polars DataFrame.
profvis({
  pl$read_csv("T_ONTIME_REPORTING.csv")$
    filter(pl$col("ORIGIN") == "JFK")$
    group_by(c("ORIGIN_CITY_NAME", "DEST_CITY_NAME"))$
    agg(pl$len()$alias("counts"))
})
profvis data
profvis data
# Profile: the same polars filter / group / count query, with the result
# converted to an R data frame at the end.
profvis({
  pl$read_csv("T_ONTIME_REPORTING.csv")$
    filter(pl$col("ORIGIN") == "JFK")$
    group_by(c("ORIGIN_CITY_NAME", "DEST_CITY_NAME"))$
    agg(pl$len()$alias("counts")) |>
    as.data.frame()
})
profvis data
profvis data

tidy approach

# Profile: readr read of the 87 MB file followed by the dplyr
# filter / group / count query.
# `.groups = "drop"` returns an ungrouped tibble and avoids the
# "summarise() has grouped output by ..." message during profiling.
profvis({
  read_csv("T_ONTIME_REPORTING.csv") %>%
    filter(ORIGIN == "JFK") %>%
    group_by(ORIGIN_CITY_NAME, DEST_CITY_NAME) %>%
    summarise(counts = n(), .groups = "drop")
})
profvis data
profvis data